预处理
# 读取数据,去除 id 和 day 列
data <- read.csv("train.csv")
data_selected <- data %>% select(-id, -day)
# 分离 rainfall,其他变量做标准化处理
rainfall <- data_selected$rainfall
predictors <- data_selected %>% select(-rainfall)
predictors_scaled <- scale(predictors)
特征组合
var_names <- names(predictors)
results <- list()
# 枚举所有至少包含两个变量的组合
for (k in 2:length(var_names)) {
var_combos <- combn(var_names, k, simplify = FALSE)
for (combo in var_combos) {
subset_data <- predictors[, combo]
subset_scaled <- scale(subset_data)
set.seed(123)
km <- kmeans(subset_scaled, centers = 2)
cluster <- as.factor(km$cluster)
# 比较两种标签映射方式,取更高准确率
acc1 <- mean(cluster == rainfall)
acc2 <- mean(rev(levels(cluster))[cluster] == rainfall)
accuracy <- max(acc1, acc2)
results[[paste(combo, collapse = "+")]] <- list(
variables = combo,
accuracy = accuracy
)
}
}
# 整理并排序准确率结果
accuracy_df <- tibble(
variables = names(results),
accuracy = map_dbl(results, ~ .x$accuracy)
) %>%
arrange(desc(accuracy))
# 输出前10准确率最高的变量组合
head(accuracy_df, 10)
## # A tibble: 10 × 2
## variables accuracy
## <chr> <dbl>
## 1 cloud+winddirection 0.698
## 2 dewpoint+cloud 0.697
## 3 dewpoint+cloud+winddirection 0.697
## 4 pressure+cloud 0.697
## 5 pressure+dewpoint+cloud 0.696
## 6 cloud+windspeed 0.687
## 7 dewpoint+cloud+windspeed 0.684
## 8 pressure+cloud+windspeed 0.680
## 9 pressure+cloud+winddirection 0.679
## 10 cloud+winddirection+windspeed 0.676
k-means
Original
subset_vars <- predictors[, c("cloud", "winddirection")]
subset_scaled <- scale(subset_vars)
set.seed(123)
km <- kmeans(subset_scaled, centers = 2)
cluster <- as.factor(km$cluster)
cat("各簇大小:\n")
## 各簇大小:
print(table(cluster))
## cluster
## 1 2
## 1714 476
cat("\n簇中心(标准化变量):\n")
##
## 簇中心(标准化变量):
print(km$centers)
## cloud winddirection
## 1 0.4669527 -0.02683802
## 2 -1.6814222 0.09663942
cat("\n聚类标签与真实rainfall混淆矩阵:\n")
##
## 聚类标签与真实rainfall混淆矩阵:
print(table(Cluster = cluster, Rainfall = rainfall))
## Rainfall
## Cluster 0 1
## 1 186 1528
## 2 354 122
acc1 <- mean(cluster == rainfall)
acc2 <- mean(rev(levels(cluster))[cluster] == rainfall)
cat("\n两种映射准确率:\n")
##
## 两种映射准确率:
cat("直接匹配:", acc1, "\n")
## 直接匹配: 0.6977169
cat("标签反转:", acc2, "\n")
## 标签反转: 0.05570776
plot_df <- as.data.frame(subset_scaled) %>%
mutate(Cluster = cluster, Rainfall = as.factor(rainfall))
ggplot(plot_df, aes(x = cloud, y = winddirection, color = Cluster, shape = Rainfall)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "Cloud和Winddirection的K-means聚类结果",
x = "Cloud (标准化)", y = "Winddirection (标准化)") +
theme_minimal()

PCA
set.seed(123)
kmeans_result <- kmeans(predictors_scaled, centers = 2)
# PCA降维到2维和3维
pca_sample <- prcomp(predictors_scaled, center = FALSE, scale. = FALSE)
pca_2d <- as.data.frame(pca_sample$x[, 1:2])
pca_3d <- as.data.frame(pca_sample$x[, 1:3])
names(pca_2d) <- c("PC1", "PC2")
pca_2d$cluster <- as.factor(kmeans_result$cluster)
pca_2d$rainfall <- as.factor(rainfall)
# 创建聚类与雨量标签组合变量
pca_2d <- pca_2d %>%
mutate(combo = case_when(
cluster == "1" & rainfall == "1" ~ "聚类1_下雨",
cluster == "1" & rainfall == "0" ~ "聚类1_不下雨",
cluster == "2" & rainfall == "1" ~ "聚类2_下雨",
cluster == "2" & rainfall == "0" ~ "聚类2_不下雨"
))
color_map <- c(
"聚类1_下雨" = "#1f77b4",
"聚类1_不下雨" = "#d62728",
"聚类2_下雨" = "#ff7f0e",
"聚类2_不下雨" = "purple"
)
ggplot(pca_2d, aes(x = PC1, y = PC2, color = combo)) +
geom_point(size = 2) +
scale_color_manual(values = color_map) +
theme_minimal() +
labs(title = "K-means聚类与雨量组合的二维PCA散点图",
color = "组合类别")

# 对标准化变量做PCA
pca_variable <- prcomp(predictors_scaled, center = FALSE, scale. = FALSE)
# 查看主成分方差解释情况
summary(pca_variable)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3466 1.4770 0.92917 0.70666 0.64952 0.49454 0.42164
## Proportion of Variance 0.5507 0.2181 0.08633 0.04994 0.04219 0.02446 0.01778
## Cumulative Proportion 0.5507 0.7688 0.85514 0.90507 0.94726 0.97172 0.98950
## PC8 PC9 PC10
## Standard deviation 0.2627 0.16686 0.09041
## Proportion of Variance 0.0069 0.00278 0.00082
## Cumulative Proportion 0.9964 0.99918 1.00000
# 提取前三个主成分的变量载荷(贡献)
loadings <- as.data.frame(pca_variable$rotation[, 1:3])
loadings$var <- rownames(loadings)
# 构建3D箭头起止点数据,用于绘制变量投影线
arrow_lines <- data.frame(
x = c(rbind(0, loadings$PC1)),
y = c(rbind(0, loadings$PC2)),
z = c(rbind(0, loadings$PC3)),
group = rep(1:nrow(loadings), each = 2)
)
# 绘制变量在前三主成分空间的3D投影箭头图
plot_ly() %>%
add_trace(
data = arrow_lines,
type = "scatter3d",
mode = "lines",
x = ~x, y = ~y, z = ~z,
split = ~group,
line = list(color = 'gray', width = 4),
showlegend = FALSE
) %>%
add_trace(
data = loadings,
type = "scatter3d",
mode = "text",
x = ~PC1, y = ~PC2, z = ~PC3,
text = ~var,
textposition = "top center",
textfont = list(size = 12),
showlegend = FALSE
) %>%
layout(
title = "变量主成分投影(3D箭头图)",
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
)
)
# 添加降雨状态标签到3D PCA数据
rainfall_factor <- factor(rainfall, levels = c(0,1), labels = c("不下雨", "下雨"))
pca_3d$rainfall <- rainfall_factor
# 绘制样本在前三主成分空间中的3D散点图,颜色区分降雨状态
fig <- plot_ly(
pca_3d,
x = ~PC1, y = ~PC2, z = ~PC3,
color = ~rainfall,
colors = c("#1f77b4", "#d62728"),
type = 'scatter3d',
mode = 'markers',
marker = list(size = 3, opacity = 0.8)
) %>%
layout(
title = "样本主成分空间中的降雨状态",
scene = list(
xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')
),
legend = list(title = list(text = "<b>降雨状态</b>"))
)
fig
pca_top3 <- as.data.frame(pca_variable$x[, 1:3])
set.seed(123)
km_pca <- kmeans(pca_top3, centers = 2)
cluster <- as.factor(km_pca$cluster)
# 计算两种标签映射的准确率,取最大值
acc1 <- mean(cluster == rainfall)
acc2 <- mean(rev(levels(cluster))[cluster] == rainfall)
accuracy <- max(acc1, acc2)
cat("使用前三主成分进行K-means聚类的准确率:", round(accuracy, 4), "\n")
## 使用前三主成分进行K-means聚类的准确率: 0.4438
因子分析
# 使用平行分析辅助确定因子数量
fa.parallel(predictors_scaled, fa = "fa")

## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
# 进行因子分析,提取3个因子,旋转方式为varimax,估计方法为最大似然
fa_result <- fa(predictors_scaled, nfactors = 3, rotate = "varimax", fm = "ml")
# 查看因子载荷矩阵(只显示载荷大于0.3的项)
print(fa_result$loadings, cutoff = 0.3)
##
## Loadings:
## ML1 ML2 ML3
## pressure -0.834
## maxtemp 0.962
## temparature 0.985
## mintemp 0.984
## dewpoint 0.960
## humidity 0.695
## cloud 0.906
## sunshine -0.841
## winddirection 0.670
## windspeed -0.322
##
## ML1 ML2 ML3
## SS loadings 5.139 2.126 0.113
## Proportion Var 0.514 0.213 0.011
## Cumulative Var 0.514 0.726 0.738
# 查看部分因子得分
head(fa_result$scores)
## ML1 ML2 ML3
## [1,] -0.4947246 0.8069461 0.8239284
## [2,] -1.2090124 1.0603347 1.1394742
## [3,] -1.7987775 -1.8475852 -0.1952701
## [4,] -0.9655903 1.2514258 1.1623847
## [5,] -1.5022529 -1.9457799 -1.9429474
## [6,] -1.1017496 0.1098068 -1.1701963
# 将因子得分转换为数据框,便于后续聚类或分析
factor_scores <- as.data.frame(fa_result$scores)
# 先做10因子因子分析,查看方差解释比例
fa_result <- fa(predictors_scaled, nfactors = 10, rotate = "varimax", fm = "ml")
fa_result$Vaccounted
## ML1 ML2 ML5 ML4 ML3
## SS loadings 5.0033893 2.0102884 0.23655106 0.079292109 0.064380994
## Proportion Var 0.5003389 0.2010288 0.02365511 0.007929211 0.006438099
## Cumulative Var 0.5003389 0.7013678 0.72502287 0.732952081 0.739390181
## Proportion Explained 0.6766913 0.2718846 0.03199272 0.010723987 0.008707310
## Cumulative Proportion 0.6766913 0.9485760 0.98056870 0.991292690 1.000000000
## ML6 ML8 ML7 ML9
## SS loadings 7.583910e-30 7.583910e-30 7.583910e-30 7.583909e-30
## Proportion Var 7.583910e-31 7.583910e-31 7.583910e-31 7.583909e-31
## Cumulative Var 7.393902e-01 7.393902e-01 7.393902e-01 7.393902e-01
## Proportion Explained 1.025698e-30 1.025698e-30 1.025698e-30 1.025698e-30
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## ML10
## SS loadings 7.583908e-30
## Proportion Var 7.583908e-31
## Cumulative Var 7.393902e-01
## Proportion Explained 1.025698e-30
## Cumulative Proportion 1.000000e+00
# 再选3因子做因子分析
fa_result <- fa(predictors_scaled, nfactors = 3, rotate = "varimax", fm = "ml")
# 提取因子载荷矩阵,转为数据框
loadings_matrix <- as.data.frame(unclass(fa_result$loadings))
loadings_matrix$Variable <- rownames(loadings_matrix)
# 转换为长格式方便绘图
loadings_long <- melt(loadings_matrix, id.vars = "Variable",
variable.name = "Factor", value.name = "Loading")
# 绘制因子载荷热力图,含载荷数值标签
ggplot(loadings_long, aes(x = Factor, y = Variable, fill = Loading)) +
geom_tile(color = "white") +
geom_text(aes(label = round(Loading, 2)), size = 4) +
scale_fill_gradient2(low = "#d7191c", high = "#1a9641", mid = "white",
midpoint = 0, limit = c(-1, 1), name = "载荷值") +
theme_minimal(base_size = 14) +
labs(title = "因子载荷热力图(含数值)", x = "因子", y = "变量") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 提取前两个因子的因子得分
fa_result <- fa(predictors_scaled, nfactors = 2, rotate = "varimax", fm = "ml")
factor_scores <- as.data.frame(fa_result$scores[, 1:2])
names(factor_scores) <- c("Factor1", "Factor2")
# 对因子得分进行K-means聚类(2类)
set.seed(123)
kmeans_result <- kmeans(factor_scores, centers = 2)
# 添加聚类标签和实际rainfall标签
factor_scores$cluster <- as.factor(kmeans_result$cluster)
factor_scores$rainfall <- as.factor(rainfall)
# 查看聚类结果与真实降雨的对应关系
table(聚类 = factor_scores$cluster, 实际下雨 = factor_scores$rainfall)
## 实际下雨
## 聚类 0 1
## 1 116 1395
## 2 424 255
# 输出聚类中心
kmeans_result$centers
## Factor1 Factor2
## 1 -0.08083072 0.5515965
## 2 0.17987514 -1.2274850
factor_scores <- factor_scores %>%
mutate(
label = case_when(
cluster == "1" & rainfall == "1" ~ "聚类1_下雨",
cluster == "1" & rainfall == "0" ~ "聚类1_不下雨",
cluster == "2" & rainfall == "1" ~ "聚类2_下雨",
cluster == "2" & rainfall == "0" ~ "聚类2_不下雨"
)
)
label_colors <- c(
"聚类1_下雨" = "#1f77b4",
"聚类1_不下雨" = "#d62728",
"聚类2_下雨" = "#ff7f0e",
"聚类2_不下雨" = "purple"
)
ggplot(factor_scores, aes(x = Factor1, y = Factor2, color = label)) +
geom_point(size = 2.5, alpha = 0.8) +
scale_color_manual(values = label_colors) +
labs(
title = "因子空间中 K-means 聚类与实际rainfall状态对比",
x = "Factor 1",
y = "Factor 2",
color = "聚类 + 实际"
) +
theme_minimal()

模型评估
# 构建数据框(标准化预测变量 + 因变量)
logit_data <- as.data.frame(predictors_scaled)
logit_data$rainfall <- as.factor(rainfall)
# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(logit_data)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- logit_data[train_index, ]
test_data <- logit_data[-train_index, ]
# 训练逻辑回归模型
logit_model <- glm(rainfall ~ ., data = train_data, family = binomial)
# 训练集预测及分类
predicted_prob_train <- predict(logit_model, newdata = train_data, type = "response")
actual_train <- as.numeric(as.character(train_data$rainfall))
predicted_class_train <- ifelse(predicted_prob_train > 0.5, 1, 0)
# 测试集预测及分类
predicted_prob_test <- predict(logit_model, newdata = test_data, type = "response")
actual_test <- as.numeric(as.character(test_data$rainfall))
predicted_class_test <- ifelse(predicted_prob_test > 0.5, 1, 0)
# 训练集混淆矩阵和准确率
cat("\n=== 训练集性能 ===\n")
##
## === 训练集性能 ===
confusion_matrix_train <- table(Predicted = predicted_class_train, Actual = actual_train)
print(confusion_matrix_train)
## Actual
## Predicted 0 1
## 0 288 77
## 1 147 1240
accuracy_train <- mean(predicted_class_train == actual_train)
cat("训练集准确率:", round(accuracy_train, 4), "\n")
## 训练集准确率: 0.8721
# 测试集混淆矩阵和准确率
cat("\n=== 测试集性能 ===\n")
##
## === 测试集性能 ===
confusion_matrix_test <- table(Predicted = predicted_class_test, Actual = actual_test)
print(confusion_matrix_test)
## Actual
## Predicted 0 1
## 0 65 27
## 1 40 306
accuracy_test <- mean(predicted_class_test == actual_test)
cat("测试集准确率:", round(accuracy_test, 4), "\n")
## 测试集准确率: 0.847
# 计算并绘制训练集和测试集的ROC曲线及AUC
roc_train <- roc(response = actual_train, predictor = predicted_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_train <- auc(roc_train)
roc_test <- roc(response = actual_test, predictor = predicted_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_test <- auc(roc_test)
plot(roc_train, col = "blue", lwd = 2, main = "训练集与测试集 ROC 曲线")
plot(roc_test, col = "red", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
paste("测试集 AUC =", round(auc_test, 4))),
col = c("blue", "red"), lwd = 2, bty = "n")

# 提取前三个主成分,构建数据框并添加标签
pca_df <- as.data.frame(pca_variable$x[, 1:3])
colnames(pca_df) <- c("PC1", "PC2", "PC3")
pca_df$rainfall <- as.factor(rainfall)
# 划分训练集(80%)和测试集(20%)
set.seed(111)
n <- nrow(pca_df)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- pca_df[train_index, ]
test_data <- pca_df[-train_index, ]
# 基于前三主成分训练逻辑回归模型
logit_pca_model <- glm(rainfall ~ PC1 + PC2 + PC3, data = train_data, family = binomial)
# 训练集预测及分类
predicted_prob_train <- predict(logit_pca_model, newdata = train_data, type = "response")
predicted_class_train <- ifelse(predicted_prob_train > 0.5, 1, 0)
actual_train <- as.numeric(as.character(train_data$rainfall))
# 测试集预测及分类
predicted_prob_test <- predict(logit_pca_model, newdata = test_data, type = "response")
predicted_class_test <- ifelse(predicted_prob_test > 0.5, 1, 0)
actual_test <- as.numeric(as.character(test_data$rainfall))
# 计算并打印训练和测试集准确率
accuracy_train <- mean(predicted_class_train == actual_train)
cat("训练集准确率:", round(accuracy_train, 4), "\n")
## 训练集准确率: 0.8704
accuracy_test <- mean(predicted_class_test == actual_test)
cat("测试集准确率:", round(accuracy_test, 4), "\n")
## 测试集准确率: 0.8379
# 打印混淆矩阵
cat("训练集混淆矩阵:\n")
## 训练集混淆矩阵:
print(table(Predicted = predicted_class_train, Actual = actual_train))
## Actual
## Predicted 0 1
## 0 280 80
## 1 147 1245
cat("测试集混淆矩阵:\n")
## 测试集混淆矩阵:
print(table(Predicted = predicted_class_test, Actual = actual_test))
## Actual
## Predicted 0 1
## 0 76 34
## 1 37 291
# 计算训练和测试集ROC曲线及AUC
roc_train <- roc(response = actual_train, predictor = predicted_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_train <- auc(roc_train)
roc_test <- roc(response = actual_test, predictor = predicted_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_test <- auc(roc_test)
# 绘制ROC曲线
plot(roc_train, col = "darkgreen", lwd = 2, main = "PCA逻辑回归 ROC 曲线")
plot(roc_test, col = "orange", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
paste("测试集 AUC =", round(auc_test, 4))),
col = c("darkgreen", "orange"), lwd = 2, bty = "n")

set.seed(123)
n_repeats <- 30 # 重复实验次数
n <- nrow(pca_df)
# 初始化结果存储向量
train_accs <- numeric(n_repeats)
test_accs <- numeric(n_repeats)
train_aucs <- numeric(n_repeats)
test_aucs <- numeric(n_repeats)
for (i in 1:n_repeats) {
# 随机划分训练集和测试集
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- pca_df[train_index, ]
test_data <- pca_df[-train_index, ]
# 拟合逻辑回归模型
model <- glm(rainfall ~ PC1 + PC2 + PC3, data = train_data, family = binomial)
# 训练集预测及评估
prob_train <- predict(model, newdata = train_data, type = "response")
class_train <- ifelse(prob_train > 0.5, 1, 0)
actual_train <- as.numeric(as.character(train_data$rainfall))
train_accs[i] <- mean(class_train == actual_train)
train_aucs[i] <- auc(roc(response = actual_train, predictor = prob_train))
# 测试集预测及评估
prob_test <- predict(model, newdata = test_data, type = "response")
class_test <- ifelse(prob_test > 0.5, 1, 0)
actual_test <- as.numeric(as.character(test_data$rainfall))
test_accs[i] <- mean(class_test == actual_test)
test_aucs[i] <- auc(roc(response = actual_test, predictor = prob_test))
}
# 输出训练集和测试集准确率及AUC的均值和标准差
cat("训练集准确率均值:", round(mean(train_accs), 4), " 标准差:", round(sd(train_accs), 4), "\n")
## 训练集准确率均值: 0.8652 标准差: 0.0042
cat("测试集准确率均值:", round(mean(test_accs), 4), " 标准差:", round(sd(test_accs), 4), "\n")
## 测试集准确率均值: 0.8625 标准差: 0.0155
cat("训练集 AUC 均值:", round(mean(train_aucs), 4), " 标准差:", round(sd(train_aucs), 4), "\n")
## 训练集 AUC 均值: 0.8934 标准差: 0.0044
cat("测试集 AUC 均值:", round(mean(test_aucs), 4), " 标准差:", round(sd(test_aucs), 4), "\n")
## 测试集 AUC 均值: 0.8867 标准差: 0.0184
set.seed(111)
# 划分训练集(80%)和测试集(20%)
n <- nrow(predictors_scaled)
train_index <- sample(1:n, size = round(0.8 * n))
test_index <- setdiff(1:n, train_index)
# 提取训练集和测试集的自变量和因变量
train_predictors <- predictors_scaled[train_index, ]
test_predictors <- predictors_scaled[test_index, ]
train_rainfall <- rainfall[train_index]
test_rainfall <- rainfall[test_index]
# 在训练集上做因子分析,提取两个因子,旋转方式为varimax
fa_train <- fa(train_predictors, nfactors = 2, rotate = "varimax", fm = "ml")
train_scores <- as.data.frame(fa_train$scores[, 1:2])
names(train_scores) <- c("fa_Factor1", "fa_Factor2")
train_scores$rainfall <- as.factor(train_rainfall)
# 用训练集因子得分拟合逻辑回归模型
logit_fa_model <- glm(rainfall ~ fa_Factor1 + fa_Factor2, data = train_scores, family = binomial)
# 训练集预测和评估
prob_train <- predict(logit_fa_model, type = "response")
class_train <- ifelse(prob_train > 0.5, 1, 0)
actual_train <- as.numeric(as.character(train_rainfall))
train_accuracy <- mean(class_train == actual_train)
train_auc <- auc(roc(response = actual_train, predictor = prob_train))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# 用训练集载荷对测试集数据做因子得分
test_scores <- as.data.frame(scale(test_predictors) %*% fa_train$loadings[, 1:2])
names(test_scores) <- c("fa_Factor1", "fa_Factor2")
test_scores$rainfall <- as.factor(test_rainfall)
# 测试集预测和评估
prob_test <- predict(logit_fa_model, newdata = test_scores, type = "response")
class_test <- ifelse(prob_test > 0.5, 1, 0)
actual_test <- as.numeric(as.character(test_rainfall))
test_accuracy <- mean(class_test == actual_test)
test_auc <- auc(roc(response = actual_test, predictor = prob_test))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# 绘制训练集和测试集ROC曲线
roc_train <- roc(response = actual_train, predictor = prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_test <- roc(response = actual_test, predictor = prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_train, col = "darkgreen", lwd = 2, main = "因子分析逻辑回归:训练集与测试集 ROC 曲线")
lines(roc_test, col = "darkred", lwd = 2)
legend("bottomright",
legend = c(paste("训练集 AUC =", round(train_auc, 4)),
paste("测试集 AUC =", round(test_auc, 4))),
col = c("darkgreen", "darkred"), lwd = 2, bty = "n")

# 输出准确率和AUC
cat("训练集准确率:", round(train_accuracy, 4), "\n")
## 训练集准确率: 0.8733
cat("训练集 AUC:", round(train_auc, 4), "\n")
## 训练集 AUC: 0.8993
cat("测试集准确率:", round(test_accuracy, 4), "\n")
## 测试集准确率: 0.8333
cat("测试集 AUC:", round(test_auc, 4), "\n")
## 测试集 AUC: 0.8563
SVM
# 构建数据框(标准化变量 + 标签)
logit_data <- as.data.frame(predictors_scaled)
logit_data$rainfall <- as.factor(rainfall)
# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(logit_data)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- logit_data[train_index, ]
test_data <- logit_data[-train_index, ]
# 训练逻辑回归模型
logit_model <- glm(rainfall ~ ., data = train_data, family = binomial)
# 训练集预测及评估
cat("\n=== 训练集性能 ===\n")
##
## === 训练集性能 ===
predicted_prob_train <- predict(logit_model, newdata = train_data, type = "response")
actual_train <- as.numeric(as.character(train_data$rainfall))
predicted_class_train <- ifelse(predicted_prob_train > 0.5, 1, 0)
confusion_train <- confusionMatrix(as.factor(predicted_class_train), as.factor(actual_train), positive = "1")
print(confusion_train$table)
## Reference
## Prediction 0 1
## 0 288 77
## 1 147 1240
cat("训练集准确率:", round(confusion_train$overall["Accuracy"], 4), "\n")
## 训练集准确率: 0.8721
cat("训练集 AUC:", round(auc(roc(response = actual_train, predictor = predicted_prob_train)), 4), "\n")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 训练集 AUC: 0.9016
cat("训练集分类指标(精确率/召回率/F1):\n")
## 训练集分类指标(精确率/召回率/F1):
print(confusion_train$byClass[c("Precision", "Recall", "F1")])
## Precision Recall F1
## 0.8940159 0.9415338 0.9171598
# 绘制训练集ROC曲线
roc_train <- roc(response = actual_train, predictor = predicted_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_train, col = "blue", lwd = 2, main = "训练集与测试集 ROC 曲线")
# 测试集预测及评估
cat("\n=== 测试集性能 ===\n")
##
## === 测试集性能 ===
predicted_prob_test <- predict(logit_model, newdata = test_data, type = "response")
actual_test <- as.numeric(as.character(test_data$rainfall))
predicted_class_test <- ifelse(predicted_prob_test > 0.5, 1, 0)
confusion_test <- confusionMatrix(as.factor(predicted_class_test), as.factor(actual_test), positive = "1")
print(confusion_test$table)
## Reference
## Prediction 0 1
## 0 65 27
## 1 40 306
cat("测试集准确率:", round(confusion_test$overall["Accuracy"], 4), "\n")
## 测试集准确率: 0.847
cat("测试集 AUC:", round(auc(roc(response = actual_test, predictor = predicted_prob_test)), 4), "\n")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 测试集 AUC: 0.8782
cat("测试集分类指标(精确率/召回率/F1):\n")
## 测试集分类指标(精确率/召回率/F1):
print(confusion_test$byClass[c("Precision", "Recall", "F1")])
## Precision Recall F1
## 0.8843931 0.9189189 0.9013255
# 绘制测试集ROC曲线(叠加在训练集曲线上)
roc_test <- roc(response = actual_test, predictor = predicted_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_test, col = "red", lwd = 2, add = TRUE)
# 添加图例
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc(roc_train), 4)),
paste("测试集 AUC =", round(auc(roc_test), 4))),
col = c("blue", "red"), lwd = 2, bty = "n")

# 准备数据框(前三个主成分 + 标签)
pca_df <- as.data.frame(pca_variable$x[, 1:3])
colnames(pca_df) <- c("PC1", "PC2", "PC3")
pca_df$rainfall <- as.factor(rainfall)
# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(pca_df)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- pca_df[train_index, ]
test_data <- pca_df[-train_index, ]
# 训练基于RBF核的SVM模型(支持概率输出)
svm_model <- svm(rainfall ~ ., data = train_data, kernel = "radial", probability = TRUE)
cat("\n=== 训练集性能 ===\n")
##
## === 训练集性能 ===
predicted_train <- predict(svm_model, newdata = train_data, probability = TRUE)
prob_train <- attr(predicted_train, "probabilities")[,2] # 正类概率
actual_train <- train_data$rainfall
confusion_train <- confusionMatrix(predicted_train, actual_train, positive = "1")
print(confusion_train$table)
## Reference
## Prediction 0 1
## 0 273 60
## 1 162 1257
cat("训练集准确率:", round(confusion_train$overall["Accuracy"], 4), "\n")
## 训练集准确率: 0.8733
cat("训练集分类指标(精确率/召回率/F1):\n")
## 训练集分类指标(精确率/召回率/F1):
print(confusion_train$byClass[c("Precision", "Recall", "F1")])
## Precision Recall F1
## 0.8858351 0.9544419 0.9188596
roc_train <- roc(response = as.numeric(as.character(actual_train)), predictor = prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_train <- auc(roc_train)
cat("训练集 AUC:", round(auc_train, 4), "\n")
## 训练集 AUC: 0.8535
plot(roc_train, col = "blue", lwd = 2, main = "基于 PCA 的 SVM:训练集与测试集 ROC 曲线")
cat("\n=== 测试集性能 ===\n")
##
## === 测试集性能 ===
predicted_test <- predict(svm_model, newdata = test_data, probability = TRUE)
prob_test <- attr(predicted_test, "probabilities")[,2]
actual_test <- test_data$rainfall
confusion_test <- confusionMatrix(predicted_test, actual_test, positive = "1")
print(confusion_test$table)
## Reference
## Prediction 0 1
## 0 59 24
## 1 46 309
cat("测试集准确率:", round(confusion_test$overall["Accuracy"], 4), "\n")
## 测试集准确率: 0.8402
cat("测试集分类指标(精确率/召回率/F1):\n")
## 测试集分类指标(精确率/召回率/F1):
print(confusion_test$byClass[c("Precision", "Recall", "F1")])
## Precision Recall F1
## 0.8704225 0.9279279 0.8982558
roc_test <- roc(response = as.numeric(as.character(actual_test)), predictor = prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_test <- auc(roc_test)
cat("测试集 AUC:", round(auc_test, 4), "\n")
## 测试集 AUC: 0.824
plot(roc_test, col = "red", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
paste("测试集 AUC =", round(auc_test, 4))),
col = c("blue", "red"), lwd = 2, bty = "n")

# 构建数据集(取前两个因子得分 + 标签)
factor_scores <- as.data.frame(fa_result$scores[, 1:2])
colnames(factor_scores) <- c("Factor1", "Factor2")
factor_scores$rainfall <- as.factor(rainfall)
# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(factor_scores)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- factor_scores[train_index, ]
test_data <- factor_scores[-train_index, ]
# 训练基于径向核的SVM模型(支持概率预测)
svm_model <- svm(rainfall ~ ., data = train_data, kernel = "radial", probability = TRUE)
cat("\n=== 训练集性能 ===\n")
##
## === 训练集性能 ===
predicted_train <- predict(svm_model, newdata = train_data, probability = TRUE)
prob_train <- attr(predicted_train, "probabilities")[,2]
actual_train <- train_data$rainfall
confusion_train <- confusionMatrix(predicted_train, actual_train, positive = "1")
print(confusion_train$table)
## Reference
## Prediction 0 1
## 0 275 61
## 1 160 1256
cat("训练集准确率:", round(confusion_train$overall["Accuracy"], 4), "\n")
## 训练集准确率: 0.8739
cat("训练集指标(精确率/召回率/F1):\n")
## 训练集指标(精确率/召回率/F1):
print(confusion_train$byClass[c("Precision", "Recall", "F1")])
## Precision Recall F1
## 0.8870056 0.9536826 0.9191365
roc_train <- roc(response = as.numeric(as.character(actual_train)), predictor = prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_train <- auc(roc_train)
cat("训练集 AUC:", round(auc_train, 4), "\n")
## 训练集 AUC: 0.8403
plot(roc_train, col = "blue", lwd = 2, main = "基于因子得分的 SVM:训练集与测试集 ROC 曲线")
cat("\n=== 测试集性能 ===\n")
##
## === 测试集性能 ===
predicted_test <- predict(svm_model, newdata = test_data, probability = TRUE)
prob_test <- attr(predicted_test, "probabilities")[,2]
actual_test <- test_data$rainfall
confusion_test <- confusionMatrix(predicted_test, actual_test, positive = "1")
print(confusion_test$table)
## Reference
## Prediction 0 1
## 0 58 26
## 1 47 307
cat("测试集准确率:", round(confusion_test$overall["Accuracy"], 4), "\n")
## 测试集准确率: 0.8333
cat("测试集指标(精确率/召回率/F1):\n")
## 测试集指标(精确率/召回率/F1):
print(confusion_test$byClass[c("Precision", "Recall", "F1")])
## Precision Recall F1
## 0.8672316 0.9219219 0.8937409
roc_test <- roc(response = as.numeric(as.character(actual_test)), predictor = prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_test <- auc(roc_test)
cat("测试集 AUC:", round(auc_test, 4), "\n")
## 测试集 AUC: 0.8332
plot(roc_test, col = "red", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
paste("测试集 AUC =", round(auc_test, 4))),
col = c("blue", "red"), lwd = 2, bty = "n")
